# -*- coding: utf-8 -*-
"""
@Time    : 2024/7/24 09:58 
@Author  : ZhangShenao 
@File    : line_delimiter_blob_parser.py 
@Desc    : 按行分割的二进制文件解析器
"""
from typing import Iterator

from langchain_core.document_loaders import BaseBlobParser
from langchain_core.documents import Document
from langchain_core.documents.base import Blob


class LineDelimiterBlobParser(BaseBlobParser):
    """Blob parser that emits one ``Document`` per line of the blob.

    The blob is read as a byte stream and every line is decoded as UTF-8.
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse ``blob`` line by line.

        Args:
            blob: The blob to read; its bytes are streamed through
                ``blob.as_bytes_io()``.

        Yields:
            One ``Document`` per line. ``page_content`` is the line decoded
            as UTF-8 (nothing is stripped from it), and ``metadata`` carries
            ``source`` (taken from ``blob.source``) and ``line_no`` (1-based).

        Raises:
            UnicodeDecodeError: If a line is not valid UTF-8.
        """
        with blob.as_bytes_io() as stream:
            # enumerate() keeps the 1-based line counter in step with the
            # loop, so there is no separate variable to increment by hand.
            for line_no, line in enumerate(stream, start=1):
                yield Document(
                    page_content=line.decode("utf-8"),
                    metadata={
                        "source": blob.source,
                        "line_no": line_no,
                    },
                )
